library(ggplot2)
library(dplyr)
library(reshape2)
library(fst)
library(haven)

load("main_data.rdata")

# create bars per year for local elections  

inc_1985 <- 
  read.fst("ind1985.fst")

inc_1986 <- 
  read.fst("ind1986.fst")

inc_1987 <- 
  read.fst("ind1987.fst")

bef_father <-
  read.fst("bef1986.fst") %>% 
  select("PNR", "KOEN", "ALDER")

# Average over income 1985-1987

inc <- 
  left_join(inc_1985 %>% select(c("PNR", "SAMLINK_NY")), 
            inc_1986 %>% select(c("PNR", "SAMLINK_NY")), 
            by = "PNR") %>% 
  left_join(., inc_1987 %>% select(c("PNR", "SAMLINK_NY")),
            by = "PNR") %>% 
  mutate(SAMLINK_NY = rowMeans(cbind(SAMLINK_NY.x, SAMLINK_NY.y, SAMLINK_NY), na.rm = TRUE)) %>% 
  select(c("PNR", SAMLINK_NY))

inc <- 
  left_join(inc, bef_father, by = "PNR")

inc <-
  inc %>%
  filter(KOEN == 1 & ALDER > 17) %>% 
  group_by(KOEN, ALDER) %>% 
  mutate(inc_tile = ntile(SAMLINK_NY, 20)) 


var <- c("run_kv", "elected_kv")
lab <- c("Running for \nmunicipality", 
         "Elected for \nmunicipality")

data_out   <- data_frame()
ttest_data <- tibble()

for(k in seq(1993, 2013, by = 4)){
  for(j in 1:2){
    
    bef <- 
      read.fst(paste("bef", k, ".fst", sep = "")) %>% 
      select("PNR", "FAR_ID", "KOM")
    
    data_year <- 
      data_comp %>% 
      filter(four_years == k) %>% 
      filter(.[,paste(var[j],"_", k, sep = "")] == 1) %>% 
      left_join(., bef, by = "PNR") %>% 
      left_join(., inc, by = c("FAR_ID" = "PNR")) 
    
    data_out_temp <-
      data_frame(inc_tile = unique(data_year$inc_tile),
                 year = k,
                 lab  = lab[j]) %>%
      left_join(., data_year %>% 
                  group_by(inc_tile) %>% 
                  summarise(parent_margin = n()),
                by = "inc_tile")
    
    ttest_data_k <- 
      data_year %>% 
      summarise(mean = mean(inc_tile, na.rm = TRUE), 
                se = sqrt(var(inc_tile, na.rm = TRUE)/sum(!is.na(inc_tile)))) %>% 
      mutate(year = k,
             level = "Local council",
             place = j,
             comparison = lab[j]) 
    
    data_out <-
      bind_rows(data_out, data_out_temp)  
    
    ttest_data <- 
      bind_rows(ttest_data, ttest_data_k)
  }
  print(k)
  print(Sys.time())
}


var <- c("run_fv", "elected_fv")
lab <- c("Running for \nparliament", 
         "Elected for \nparliament")


for(k in c(1990, 1994, 1998, 2001, 2005, 2007, 2011, 2015)){
  for(j in 1:2){
    
    bef <- 
      read.fst(paste("bef", k, ".fst", sep = "")) %>% 
      select("PNR", "FAR_ID", "KOM")
    
    if(j == 1){
      data_year <- 
        data_comp %>% 
        filter(four_years == k) %>% 
        filter(.[,paste(var[j],"_", k, "_FV", sep = "")] == 1) %>% 
        left_join(., bef, by = "PNR") %>% 
        left_join(., inc, by = c("FAR_ID" = "PNR")) 
    }
    if(j == 2){
      data_year <- 
        data_comp %>% 
        filter(four_years == k) %>% 
        filter(.[,paste(var[j],"_", k, sep = "")] == 1) %>% 
        left_join(., bef, by = "PNR") %>% 
        left_join(., inc, by = c("FAR_ID" = "PNR"))
    }
    data_out_temp <-
      data_frame(inc_tile = unique(data_year$inc_tile),
                 year = k,
                 lab  = lab[j]) %>%
      left_join(., data_year %>% 
                  group_by(inc_tile) %>% 
                  summarise(parent_margin = n()),
                by = "inc_tile")
    
    ttest_data_k <- 
      data_year %>% 
      summarise(mean = mean(inc_tile, na.rm = TRUE), 
                se = sqrt(var(inc_tile, na.rm = TRUE)/sum(!is.na(inc_tile)))) %>% 
      mutate(year = k,
             level = "Parliament",
             place = j,
             comparison = lab[j]) 
    
    data_out <-
      bind_rows(data_out, data_out_temp)  
    
    ttest_data <- 
      bind_rows(ttest_data, ttest_data_k)
  }
  
  
  
  print(k)
  print(Sys.time())
}

data_out <-
  data_out %>% 
  filter(!is.na(inc_tile))

plot_data <- 
  data_out %>% 
  group_by(inc_tile, lab) %>% 
  summarise(sum_inc = sum(parent_margin)) %>% 
  group_by(lab) %>%
  mutate(share = sum_inc/sum(sum_inc)) %>% 
  ungroup() %>%
  mutate(lab = as.factor(lab),
         lab = factor(lab, levels = levels(lab)[c(3, 1, 4, 2)]))


plot <- 
  ggplot(data = plot_data,
         aes(x = inc_tile,
             y = share)) + 
  geom_bar(stat = "identity") + 
  facet_grid(lab ~ .) +
  theme_classic() + 
  scale_x_continuous("Percentile", 
                     labels = c("0%", "25%", "50%", "75%", "100%")) + 
  geom_hline(yintercept = 0.05, linetype = "dashed", alpha = 0.5) + 
  scale_y_continuous("") +
  theme(panel.spacing = unit(1, "lines")) 

last_plot()

ggsave("fathers_inctile.pdf")
       

# Add representation based on education for comparison

lab <- c("ba", "ma", "phd")

for(k in seq(1993, 2013, by = 4)){
  edu <- 
    read.fst(paste0("udda_recoded", k, ".fst"))
  
  edu40 <- 
    read.fst(paste0("bef", k, ".fst")) %>% 
    filter(ALDER == 48) %>%  # compare to people with age = mean(age_politicians)
    left_join(., edu) %>%
    select("PNR", "FAR_ID", "KOM", "education")
  
  
  edu40 <-
    edu40 %>% 
    mutate(ba = NA,
           ba  = case_when(education == "Bachelor" ~ 1,
                           education == "Lange videreg?ende uddannelser" ~ 1,
                           education == "Mellemlange videreg?ende uddannelser" ~ 1,
                           education == "Forskeruddannelser" ~ 1,
                           is.na(ba)  ~ 0),
           ma = NA,
           ma  = case_when(education == "Lange videreg?ende uddannelser" ~ 1,
                           education == "Forskeruddannelser" ~ 1,
                           is.na(ma)  ~ 0),
           phd = NA,
           phd = case_when(education == "Forskeruddannelser" ~ 1,
                           is.na(phd)  ~ 0))
  
  data_year <- 
    data_comp %>% 
    filter(four_years == k) %>% 
    left_join(., edu40, by = "PNR") %>% 
    left_join(., inc, by = c("FAR_ID" = "PNR")) %>% 
    filter(!is.na(ba) & !is.na(PNR))
  
  for(j in 1:3){
    
    if(j == 1){
      data_year_edu <-
        filter(data_year, ba == 1)
    }  
    
    if(j == 2){
      data_year_edu <-
        filter(data_year, ma == 1)
    }  
    
    if(j == 3){
      data_year_edu <-
        filter(data_year, phd == 1)
    }    
    
    
    ttest_data_k <- 
      data_year_edu %>% 
      summarise(mean = mean(inc_tile, na.rm = TRUE), 
                se = sqrt(var(inc_tile, na.rm = TRUE)/sum(!is.na(inc_tile)))) %>% 
      mutate(year = k,
             level = "Local council",
             place = j+2,
             comparison = lab[j]) 
    
    ttest_data <- 
      bind_rows(ttest_data, ttest_data_k)
  }
  print(k)
  print(Sys.time())
}

for(k in c(1990, 1994, 1998, 2001, 2005, 2007, 2011, 2015)){
  
  edu <- 
    read.fst(paste0("udda_recoded", k, ".fst"))
  
  edu40 <- 
    read.fst(paste0("bef", k, ".fst")) %>% 
    filter(ALDER == 45) %>%  # compare to people with age = mean(age_politicians)
    left_join(., edu) %>%
    select("PNR", "FAR_ID", "KOM", "education")
  
  
  edu40 <-
    edu40 %>% 
    mutate(ba = NA,
           ba  = case_when(education == "Bachelor" ~ 1,
                           education == "Lange videreg?ende uddannelser" ~ 1,
                           education == "Mellemlange videreg?ende uddannelser" ~ 1,
                           education == "Forskeruddannelser" ~ 1,
                           is.na(ba)  ~ 0),
           ma = NA,
           ma  = case_when(education == "Lange videreg?ende uddannelser" ~ 1,
                           education == "Forskeruddannelser" ~ 1,
                           is.na(ma)  ~ 0),
           phd = NA,
           phd = case_when(education == "Forskeruddannelser" ~ 1,
                           is.na(phd)  ~ 0))
  
  data_year <- 
    data_comp %>% 
    filter(four_years == k) %>% 
    left_join(., edu40, by = "PNR") %>% 
    left_join(., inc, by = c("FAR_ID" = "PNR")) %>% 
    filter(!is.na(ba) & !is.na(PNR))
  
  
  for(j in 1:3){
    
    if(j == 1){
      data_year_edu <-
        filter(data_year, ba == 1)
    }  
    
    if(j == 2){
      data_year_edu <-
        filter(data_year, ma == 1)
    }  
    
    if(j == 3){
      data_year_edu <-
        filter(data_year, phd == 1)
    }    
    
    
    ttest_data_k <- 
      data_year_edu %>% 
      summarise(mean = mean(inc_tile, na.rm = TRUE), 
                se = sqrt(var(inc_tile, na.rm = TRUE)/sum(!is.na(inc_tile)))) %>% 
      mutate(year = k,
             level = "Parliament",
             place = j+2,
             comparison = lab[j]) 
    
    ttest_data <- 
      bind_rows(ttest_data, ttest_data_k)
  }
  print(k)
  print(Sys.time())
}

## Plot means for each year 

## rescale 

ttest_data <-
  ttest_data %>% 
  mutate(mean  = 100/19 * (mean - 1),
         se    = 100/19 * se)

# Too few PhDs prior to 1998: drop for discretion

ttest_data <- 
  filter(ttest_data, !(comparison == "phd" & year < 1998))
  

plot_ttest_local <- 
  ggplot(data = ttest_data %>% filter(level == "Local council"),
         aes(x    = mean,
             xmin = mean + qnorm(0.025) * se,
             xmax = mean + qnorm(0.975) * se,
             y    = place)) +
  geom_errorbarh(height = 0, size = 1) +
  geom_point(size = 2) +
  facet_wrap(. ~ year, ncol = 3) +
  scale_y_continuous("", breaks = 1:5, labels = c("Running for \nmunicipality",
                                                  "Elected for \nmunicipality",
                                                  "Bachelor's \ndegree",
                                                  "Master's \ndegree", "PhD")) +
  scale_x_continuous("") +
  theme_classic() +
  geom_vline(xintercept = 50, alpha = 0.5, linetype = "dashed")

last_plot()

plot_ttest_national <- 
  ggplot(data = ttest_data %>% filter(level == "Parliament"),
         aes(x    = mean,
             xmin = mean + qnorm(0.025) * se,
             xmax = mean + qnorm(0.975) * se,
             y    = place)) +
  geom_errorbarh(height = 0, size = 1) +
  geom_point(size = 2) +
  facet_wrap(. ~ year, ncol = 3) +
  scale_y_continuous("", breaks = 1:5, labels = c("Running for \nparliament",
                                                  "Elected for \nparliament",
                                                  "Bachelor's \ndegree",
                                                  "Master's \ndegree", "PhD")) +  scale_x_continuous("") +
  theme_classic() +
  geom_vline(xintercept = 50, alpha = 0.5, linetype = "dashed")

last_plot()

